import argparse
import json
import os
import cv2
import numpy as np
from tqdm import tqdm
import base64
import os, json
import sys
import pandas as pd
# Directory two levels above this file (the parent of the folder containing
# this script). Presumably the project root, so that project packages can be
# imported when the script is run directly -- TODO confirm.
llava_path = os.path.dirname(os.path.dirname(__file__))
if llava_path not in sys.path:
    # Append rather than prepend, so existing sys.path entries keep priority.
    sys.path.append(llava_path)
    # Prints the full search path, but only when it was just modified.
    print(sys.path)


def img_from_base64(imagestring):
    """Decode a base64-encoded image file into an OpenCV image array.

    Args:
        imagestring: Base64 text (``str`` or ``bytes``) holding the bytes of
            an encoded image file (e.g. a JPEG).

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` on the decoded
        bytes, or ``None`` when the input is not valid base64 or decodes to
        zero bytes.
    """
    try:
        jpgbytestring = base64.b64decode(imagestring)
    except ValueError:
        # binascii.Error (bad padding / malformed input) subclasses ValueError.
        return None

    if not jpgbytestring:
        # An empty payload would be handed to cv2.imdecode as an empty buffer,
        # which OpenCV rejects with an error instead of returning None.
        return None

    nparr = np.frombuffer(jpgbytestring, np.uint8)
    return cv2.imdecode(nparr, cv2.IMREAD_COLOR)

def get_image(img_tsv, idx):
    """Return ``(key, image)`` for entry ``idx`` of ``img_tsv``.

    The first field of the entry is used as the key and the last field is
    passed to ``img_from_base64`` to obtain the image.
    """
    record = img_tsv[idx]
    return record[0], img_from_base64(record[-1])

def get_instruct(caption_tsv, idx):
    """Return ``(key, question, answer)`` for entry ``idx`` of ``caption_tsv``.

    The first field of the entry is the key. The last field is parsed as
    JSON, and the ``question`` / ``answer`` values are read from the first
    element of the parsed sequence.
    """
    record = caption_tsv[idx]
    sample_key = record[0]
    first_pair = json.loads(record[-1])[0]
    return sample_key, first_pair['question'], first_pair['answer']


def cli():
    """Parse the command-line options and prepare the image output folder.

    Options:
        --data_dir:  directory holding the input TSV; outputs are written here.
        --input_tsv: file name of the TSV inside ``data_dir``.

    Side effect: creates ``<data_dir>/image`` if it does not exist yet.

    Returns:
        The ``argparse.Namespace`` with ``data_dir`` and ``input_tsv``.
    """
    arg_parser = argparse.ArgumentParser()
    string_options = (
        ('--data_dir', './blob_dir/debug_output/llava/data/mmbench'),
        ('--input_tsv', 'mmbench_dev_20230712.tsv'),
    )
    for flag, fallback in string_options:
        arg_parser.add_argument(flag, type=str, default=fallback)

    parsed = arg_parser.parse_args()
    # Make sure the folder that will receive the decoded images is present.
    os.makedirs(parsed.data_dir + '/image', exist_ok=True)
    return parsed


def main():
    """Convert a TSV of samples into JPEG files plus one JSON annotation file.

    For every row of ``<data_dir>/<input_tsv>``:
      * the base64 ``image`` column is decoded and written to
        ``<data_dir>/image/<index>.jpg`` (skipped if that file already exists);
      * all other columns are collected as one annotation dict, with
        ``image_id`` added as a copy of ``index``.

    The annotations are written to ``<data_dir>/<input_tsv>.json`` under the
    ``annotations`` key.

    Raises:
        ValueError: if a row's image payload cannot be decoded.
    """
    args = cli()
    # pandas has no ``read_tsv``; tab-separated files are read with
    # ``read_csv`` and an explicit tab separator.
    tsv_data = pd.read_csv(
        os.path.join(args.data_dir, args.input_tsv), sep='\t', header=0)
    print(len(tsv_data))

    # Expected header:
    # ['index', 'question', 'hint', 'A', 'B', 'C', 'D', 'answer', 'category',
    #  'image', 'source', 'l2-category', 'comment', 'split']
    instruct_list = []
    for _, row in tsv_data.iterrows():
        img_fname = '{}/image/{}.jpg'.format(args.data_dir, row['index'])
        if not os.path.exists(img_fname):
            # Decode the whole base64 payload; indexing it with [-1] would
            # pass only its final character.
            cv2_im = img_from_base64(row['image'])
            if cv2_im is None:
                raise ValueError(
                    'could not decode image for index {}'.format(row['index']))
            cv2.imwrite(img_fname, cv2_im)

        # Keep every column except the bulky base64 image payload.
        text_data = {k: v for k, v in row.items() if k != 'image'}
        text_data['image_id'] = text_data['index']
        instruct_list.append(text_data)

    output_json = {'annotations': instruct_list}

    total_sample = len(instruct_list)
    print('total instruct-answer pairs:', total_sample)
    json_fname = '{}/{}.json'.format(args.data_dir, args.input_tsv)
    with open(json_fname, 'w') as f:
        json.dump(output_json, f)

# Run the conversion only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
